suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(ggridges)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(tidytext)))
suppressMessages(suppressWarnings(library(reshape2)))
suppressMessages(suppressWarnings(library(treemapify)))
suppressMessages(suppressWarnings(library(corrplot)))
suppressMessages(suppressWarnings(library(stringr)))
data <- read.csv("data/train.tsv", row.names = NULL, sep = "\t")
First, a look at the types of the columns:
str(data)
## 'data.frame': 1482535 obs. of 8 variables:
## $ train_id : int 0 1 2 3 4 5 6 7 8 9 ...
## $ name : Factor w/ 1225273 levels "________ON HOLD TILL TUES_________",..: 725736 936774 151110 586228 57500 263846 100403 444406 446083 906076 ...
## $ item_condition_id: int 3 3 1 1 1 3 3 3 3 3 ...
## $ category_name : Factor w/ 1288 levels "","Beauty/Bath & Body/Bath",..: 810 88 1256 487 1183 1195 1255 888 888 1024 ...
## $ brand_name : Factor w/ 4810 levels "","!iT Jeans",..: 1 3616 4243 1 1 1 52 4029 3111 1 ...
## $ price : num 10 52 10 35 44 59 64 6 19 8 ...
## $ shipping : int 1 0 1 1 0 0 0 1 0 0 ...
## $ item_description : Factor w/ 1281427 levels "","️️ ❤️BRAND NEW\u274cPrice is firm ❤️Expiration Date:3 years. 100% ORIGINAL BELLA CREAM.... BREAST AND BUTT ENHANCER"| __truncated__,..: 823522 1135769 132062 795038 391645 176183 1020295 1275882 498628 576888 ...
name, brand_name and item_description are all stored as factors, when it makes more sense to store these as characters, given their high cardinality. Also, I hate working with factors in R.
data$name <- as.character(data$name)
data$brand_name <- as.character(data$brand_name)
data$item_description <- as.character(data$item_description)
data$category_name <- as.character(data$category_name)
Before I start, I’ll check if there are any clearly faulty prices.
data %>%
ggplot(aes(x = price)) +
geom_density()
data %>%
arrange(desc(price)) %>%
head(10)
The distribution of prices looks believable. The most expensive items cost around $2,000, and they are almost all designer jewlery or handbags, which is believable.
How many missing values does each column have?
lapply(data, function(v) sum(is.na(v)))
## $train_id
## [1] 0
##
## $name
## [1] 0
##
## $item_condition_id
## [1] 0
##
## $category_name
## [1] 0
##
## $brand_name
## [1] 0
##
## $price
## [1] 0
##
## $shipping
## [1] 0
##
## $item_description
## [1] 0
There are no NA values in the dataset, but I know that not all the columns are complete from inspecting the data. I suspect that some of the empty values are encoded as the empty string ’’.
# convert the empty string to native NA
data <- data %>%
mutate(name = ifelse(name == "", NA, name),
brand_name = ifelse(brand_name == "", NA, brand_name),
item_description = ifelse(item_description == "", NA, item_description),
category_name = ifelse(category_name == "", NA, category_name))
# what percentage of the data is missing for each column?
lapply(data, function(v) mean(is.na(v)))
## $train_id
## [1] 0
##
## $name
## [1] 0
##
## $item_condition_id
## [1] 0
##
## $category_name
## [1] 0.00426769
##
## $brand_name
## [1] 0.4267569
##
## $price
## [1] 0
##
## $shipping
## [1] 0
##
## $item_description
## [1] 2.698081e-06
This makes more sense - now we see that around 42% of the brands are not specified, and a small portion of the data does not have a category or an item description.
Now, I’ll take a closer look at each of the columns in isolation to get a feel for the content of the data. Later I’ll consider multivariate properties.
Taking a look at the categories stored in the data:
data %>%
count(category_name, sort = TRUE) %>%
rename(frequency = n)
The categories seem to be a backslash (/) delimited list of categories, with the first word being most general (e.g. Women, Men, Beauty) and the last word most specific (Shorts, Necklaces, T-Shirts).
The first thing I can do is split up this column into three different subcategories in the category heirarchy:
# split the category into a hierary
data <- data %>%
separate(col = category_name,
into = c("high_category", "mid_category", "low_category"),
sep = "/",
remove = FALSE)
## Warning: Too many values at 4389 locations: 240, 743, 1701, 2829, 2924,
## 3395, 3817, 3881, 4160, 4288, 4784, 5469, 5652, 5820, 6570, 6732, 6901,
## 7110, 7217, 7735, ...
Now checking out the frequency of our new categories:
data %>%
group_by(high_category) %>%
summarise(frequency = n(),
avg.price = mean(price),
price.std = sqrt(var(price))) %>%
arrange(desc(frequency))
data %>%
ggplot(aes(x = price, y = high_category, fill = high_category)) +
geom_density_ridges() +
theme(legend.position = "")
## Picking joint bandwidth of 1.3
Taking a look at this on the log-scale:
data %>%
ggplot(aes(x = price, fill = high_category)) +
theme(legend.position = "") +
geom_density() +
facet_wrap(~high_category) +
scale_x_log10() +
labs(title = "Distribution of prices over High-Categories - log-scale")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 874 rows containing non-finite values (stat_density).
It looks like the overall distribution of prices is roughly the same across the high-level categories. This doesn’t tell us how much the prices vary within in a mid-level category within each high-level category.
To see that, I’ll do the following:
data %>%
group_by(high_category) %>%
mutate(unique.mid_category = n_distinct(mid_category)) %>%
ungroup() %>%
group_by(high_category, mid_category) %>%
mutate(avg.price.mid_category = mean(price),
stddev.price.mid_category = sqrt(var(price))) %>%
select(high_category,mid_category, price, avg.price.mid_category, stddev.price.mid_category,unique.mid_category) %>%
group_by(high_category) %>%
summarize(avg.price.range = max(avg.price.mid_category) - min(avg.price.mid_category),
stddev.price.range = max(stddev.price.mid_category) - min(stddev.price.mid_category),
unique.mid_category = first(unique.mid_category)) %>%
arrange(desc(avg.price.range))
So the average prices of the mid-level categories within the Electronics and Vintage & Collectibles have the biggest range. Plotting these within-category distributions:
# a function for making said plot
tmp <- function(high){
data %>%
filter(high_category == high) %>%
ggplot(aes(x = price, fill = mid_category)) +
geom_density() +
scale_x_log10() +
facet_wrap(~mid_category) +
labs(title = paste("Distribution of prices within '", high,"' high-level category")) +
theme(legend.position = "")
}
tmp("Electronics")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 58 rows containing non-finite values (stat_density).
It lookds like within the Electronics subcategory, Car Audio, Video & GPS is on one extreme of prices, while Media is on the inexpensive extreme. But in general, the change in shape is not drastic.
tmp("Vintage & Collectibles")
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 11 rows containing non-finite values (stat_density).
Again, the difference in the shape of the distributions across the mid-level in the Vintage & Collectibles high-level category is not extreme.
# how many unique low-level categories?
data %>%
count(low_category, sort = TRUE) %>%
rename(frequency = n)
There are 871 unique low-level categories. Some of them are very esoteric - with only a few dozen (or only one) listing with that low-level category.
# esoteric low-level categories first
data %>% count(low_category) %>%
rename(frequency = n) %>%
arrange(frequency)
Thinking down the line, this is going to be problematic to use as input to most machine learning algorithm. The low-level category of a listing is a categorical variable, and so to encode it we’ll have to use a “one-hot” encoding scheme. This will increase the dimensionality of our datset substancially, as we’ll have to add 871 variables to encode the low-level category of each variable.
A possible solution to this is to bin our low-level categories in some meaningful way (more on this to come TODO). I’ll probably use some variation of splitting the low-level categories into quantiles based on their average prices.
But before I do so, I’ll need to make sure I’m not mixing up data from low-level categories across mid and high-level categories. To be sure that I’m not doing that, I’ll need to see if low-level categories are distinct amongst mid/high level categories or not:
data %>%
group_by(low_category) %>%
summarize(distinct.mid = n_distinct(mid_category),
distinct.high = n_distinct(high_category)) %>%
arrange(desc(distinct.mid))
So, low-level categories are not distinct amongst high/mid-level categories. I’ll have to keep this in mind.
The cardinality of low-level categories is very high. But how rare are some of the less frequently occuring categories?
data %>%
group_by(low_category) %>%
summarise(frequency = n()) %>%
ungroup() %>%
mutate(proportion = frequency/sum(frequency)) %>%
arrange(desc(frequency)) %>%
mutate(cum.proportion = cumsum(proportion),
frequency.rank = row_number()) %>%
filter(cum.proportion <= .81) %>%
arrange(desc(cum.proportion))
The top 100 low-level categories (out of ~870) account for 80% of the data. This may make things easier if we want to bin the low-level categories later for dimensionality reduction.
data %>%
group_by(mid_category) %>%
summarise(frequency = n()) %>%
ungroup() %>%
mutate(proportion = frequency/sum(frequency)) %>%
arrange(desc(frequency)) %>%
mutate(cum.proportion = cumsum(proportion),
frequency.rank = row_number(),
total.mid_categories = n()) %>%
filter(cum.proportion <= .81) %>%
arrange(desc(cum.proportion))
The top 30 mid-level categories (out of 114) accont for 80% of the data.
I wonder if there are duplicate categories - perhaps due to letter caseing:
# getting the number of low-level categories,
# grouped by lowercased low-level category
data %>%
mutate(lowercase.category = str_to_lower(low_category)) %>%
group_by(lowercase.category) %>%
summarize(num.categories = n_distinct(low_category)) %>%
arrange(desc(num.categories))
# what are the different variations of t-shirts?
data %>%
filter(str_to_lower(low_category) == "t-shirts") %>%
count(low_category)
The only low-level category that would benefit from case normalization is that of T-Shirts.
# getting the number of mid-level categories,
# grouped by lowercased mid-level category
data %>%
mutate(lowercase.category = str_to_lower(mid_category)) %>%
group_by(lowercase.category) %>%
summarize(num.categories = n_distinct(mid_category)) %>%
arrange(desc(num.categories))
Mid-level categories would not benefit from case-normalization.
The brand of a product will undoubtedly be an important factor determining the price, and so it’s important to check for any noise/inconsistencies in this column:
data %>%
group_by(brand_name) %>%
summarize(frequency = n(),
avg.price = mean(price),
price.stddev = sqrt(var(price))) %>%
arrange(desc(frequency))
Many items do not have brands listed (42%). Is that a signal for the price of the item?
data %>%
mutate(contains.brand = !is.na(brand_name)) %>%
ggplot(aes(x = contains.brand, y = price, fill = contains.brand)) +
geom_boxplot() +
scale_y_log10() +
coord_flip()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
It looks like listings that contain brands tend to have higher prices on average. This will likely be a useful feature.
There are 4,809 unique brands. Again, this will get us into dimensionality trouble when it comes to machine learning. If possible, we should bin brands together.
data %>%
group_by(brand_name) %>%
summarise(frequency = n()) %>%
arrange(desc(frequency)) %>%
mutate(brand.number = row_number()) %>%
ggplot(aes(x = brand.number, y = frequency)) +
geom_col() +
scale_y_log10() +
ylab("log(Frequency)") +
labs(title = "Brand Frequency - log-scale")
data %>%
group_by(brand_name) %>%
summarise(frequency = n()) %>%
ungroup() %>%
mutate(proportion = frequency/sum(frequency)) %>%
arrange(desc(frequency)) %>%
mutate(cum.proportion = cumsum(proportion),
frequency.rank = row_number()) %>%
filter(cum.proportion <= .81) %>%
arrange(desc(cum.proportion))
The frequency of the different classes folows a strong power rule - the top 70 most frequently occuring brands account for account for 80% of the data.
Again, checking if the levels of this variable would benefit from case normalization:
# getting the number of low-level categories,
# grouped by lowercased low-level category
data %>%
mutate(lowercase.brand = str_to_lower(brand_name)) %>%
group_by(brand_name) %>%
summarize(num.categories = n_distinct(lowercase.brand)) %>%
arrange(desc(num.categories))
They would not.
What are the most eexpensive brands?
data %>%
group_by(brand_name) %>%
summarize(avg.price = mean(price),
frequency = n(),
num.categories = n_distinct(high_category)) %>%
filter(frequency > 1000) %>%
arrange(desc(avg.price))
Not surprisingly, the most expensive brands are designer clothing/jewlery and well known electronics companies, such as Apple, and Beats by Dr. Dre.
I’m surprised, however, that these top brands have items in so many high-level categories. For example, Air-Jordan makes Basketball shoes, but there are items listed under this brand in 5 high-level categories!
Looking at the relative frequencies of the high-level categories in the top-20 most expensive brands (that have 1000+ listings):
data %>%
inner_join(
data %>%
group_by(brand_name) %>%
summarize(avg.price = mean(price),
frequency = n(),
num.categories = n_distinct(high_category)) %>%
filter(frequency >= 1000) %>%
top_n(20, avg.price),
on = c("brand_name" = "brand_name")
) %>%
group_by(brand_name, high_category) %>%
summarize(avg.price = mean(price),
frequency = n()) %>%
arrange(brand_name) %>%
ggplot(aes(x = high_category, y = frequency, fill = high_category, label = high_category)) +
geom_col() +
facet_wrap(~brand_name, scales = "free") +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank()) +
theme(legend.position = "") +
coord_flip() +
geom_text( size = 2)
## Joining, by = "brand_name"
## Warning: Removed 18 rows containing missing values (geom_text).
Another way to visualize the proportion of products in each item category for each brand is with a treemap:
data %>%
inner_join(
data %>%
group_by(brand_name) %>%
summarize(avg.price = mean(price),
frequency = n(),
num.categories = n_distinct(high_category)) %>%
filter(frequency >= 1000) %>%
top_n(15, avg.price),
on = c("brand_name" = "brand_name")
) %>%
group_by(brand_name, high_category) %>%
summarize(avg.price = mean(price),
frequency = n()) %>%
ungroup() %>%
ggplot(aes(area = frequency, label = high_category, subgroup = brand_name, fill = high_category)) +
geom_treemap() +
geom_treemap_subgroup_border() +
geom_treemap_subgroup_text(place = "centre", grow = T, alpha = 0.5, colour =
"black", fontface = "italic", min.size = 0) +
geom_treemap_text(colour = "white", place = "topleft", reflow = T) +
theme(legend.position = "null")
## Joining, by = "brand_name"
## Warning: Removed 13 rows containing missing values (geom_treemap_text).
Here, each dark rectangle is a brand, and each sub-rectangle is a diferent itme category. Rectangles with one solid color like that of Apple and Air Jordan represent comanies whose offerings are primarly focused in one category, while rectangles that are split up like that of Gucci and Chanel are companies that offer products across categories.
We can see that for these top brands, the majority of their items are in the same high-level categories. For example, 99.4% of Samsung’s listings are under the category “Electronics”, while the remaining are scattered amongst 5 other categories.
I think this is an opportunity to consolidate the high-level categories some. For brands that have an overwhelming majorityof items in one high-level category, perhaps it makes more sense to convert all the categories to the most frequenlty occuring category. If this is the case, I could also flag if an item’s high-level category is not the brand’s core-competency - as a feature.
As a side note - I notice from this graph that “Jordan” and “Air-Jordan” are seperate brands. Same goes for “Beats” and “Beats by Dr. Dre”. I’ll quickly merge these brands into one.
data = data %>%
mutate(brand_name = case_when(
brand_name == "Air Jordan" ~ "Jordan",
brand_name == "Beats by Dr. Dre" ~ "Beats",
TRUE ~ brand_name)
)
Mercari is a marketplace for buying/selling used goods. In such an environment, it’s clear that the condition of an item will be important for the asking price.
data %>% group_by(item_condition_id) %>%
summarize(frequency = n(),
avg.price = mean(price))
There are 5 levels for item-condition_id. From the average price, it’s not clear if these levels have an inherent ordering (e.g. 5 == “new”, 1 == “bad condition”)
To figure this out, I’ll have to read some descriptions!
# a sample of descriptions where `item_condition_id` is 1.
set.seed(1)
data %>%
filter(item_condition_id == 1) %>%
sample_n(50) %>%
select(name, item_description)
Reading the It seems like when item_condition_id is equal to one, the item is new. For example, the second rating I see here is “Brand new sealed ps4”.
# a sample of descriptions where `item_condition_id` is 5.
set.seed(1)
data %>%
filter(item_condition_id == 5) %>%
sample_n(50) %>%
select(name, item_description)
When item_condition_id is equal to five, the item seems to be broken or only paritally complete. For example, one of the descriptions in this sample is: “Comes with booklet case and game but it doesn’t work. Tested. Maybe it needs resurfacing.”
I can try and make this a bit more concrete by looking at which words appear more frequenlty in postings of each of the different item conditions, compared to the global frequency. This is a bit hard to follow, so to be clear about what I’m going to compute:
set.seed(1)
tmp = data %>%
sample_n(100000) %>%
mutate(num.postings = n()) %>% # total number of postings
group_by(item_condition_id) %>%
mutate(num.postings.condition = n()) %>% # number of postings in each of the conditions
ungroup() %>%
unnest_tokens(word, item_description) %>%
anti_join(stop_words) %>% # get rid of stopwords
group_by(word) %>%
mutate(postings.word = n_distinct(train_id)) %>% # how many postings have each word?
group_by(word, item_condition_id) %>%
summarise(postings.word.condition = n_distinct(train_id), # how many postigns of each item condition have each word?
postings.word = first(postings.word),
num.postings = first(num.postings),
num.postings.condition = first(num.postings.condition)) %>%
mutate(post.proportion.with.word = postings.word/num.postings, # proportion of postings with each word
post.condition.proportion.with.word = postings.word.condition/num.postings.condition) %>% # proportion of postings within each condition with each word
mutate(proportion.difference = post.condition.proportion.with.word - post.proportion.with.word) %>% # difference in said proportions
ungroup() %>%
filter(postings.word > 100) %>% # only keep the words that appear somewhat frequently
select(word,item_condition_id, post.proportion.with.word, post.condition.proportion.with.word,proportion.difference) %>%
arrange(desc(proportion.difference))
## Joining, by = "word"
tmp %>%
filter(item_condition_id == 5) %>%
arrange(desc(proportion.difference)) %>%
filter(row_number() <= 20) %>%
select(-proportion.difference) %>%
melt(id.vars = c("word", "item_condition_id")) %>%
ggplot(aes(x = word, y = value, fill = variable)) +
geom_col(position = "dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
ylab("Proportion of postings with word") +
labs(title = "Relative proportion of postings with words",
subtitle = "All postings vs. postings with item_condition_id = 5")
In postings where item_condition_id = 5, words like broken, missing, scuffs, screen and missing appear much more frequently than in other postings. This really makes it clear that the condition of items with this item_condition_id are poor.
tmp %>%
filter(item_condition_id == 1) %>%
arrange(desc(proportion.difference)) %>%
filter(row_number() <= 20) %>%
select(-proportion.difference) %>%
melt(id.vars = c("word", "item_condition_id")) %>%
ggplot(aes(x = word, y = value, fill = variable)) +
geom_col(position = "dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
ylab("Proportion of postings with word")+
labs(title = "Relative proportion of postings with words",
subtitle = "All postings vs. postings with item_condition_id = 1")
In postings where item_condition_id = 1, words like tags, sealed, brand and box are more frequently occuring than in other postings, indicating that indeed this condition id is of newer items.
tmp %>%
filter(item_condition_id == 3) %>%
arrange(desc(proportion.difference)) %>%
filter(row_number() <= 20) %>%
select(-proportion.difference) %>%
melt(id.vars = c("word", "item_condition_id")) %>%
ggplot(aes(x = word, y = value, fill = variable)) +
geom_col(position = "dodge") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
coord_flip() +
ylab("Proportion of postings with word")+
labs(title = "Relative proportion of postings with words",
subtitle = "All postings vs. postings with item_condition_id = 3")
We can see that some of the words that appear in postings where item_condition_id = 3 indicate that these items are in poor condition, with words such as flaws, holes, rips, worn and stains appearing more frequently than in the rest of the postings. However, the word excellent appears more frequently than in in all the postings (in the aggreagate).
It seems like there is an inherent ordering in the values of item_condition_id after all: \[
\text{Condition of item}(i) < \text{Condition of item}(j) \\
\text{if }\quad \text{item_condition_id}_i > \text{item_condition_id}_j
\]
Therefore, it wouldnt be proposterous to use the variable item_condition_id as-is in a machine learning regressor. However, if we do so, then we are asserting that the difference in quality between items with item_condition_id = 1 and item_condition_id = 2 is the same as the difference in quality between items with item_condition_id = 2 and item_condition_id = 3, and so on.
We are definitely not justified to make this assertion! Just becasue we know there is an ordering to the condition id’s, it does not mean we can assign a magnitude to these differences. Therefore, it will make more sense to encode this variable with a one-hot encoding, and use it as a categorical predictor come prediction time, as opposed to a numeric predictor.
data %>%
ggplot(aes(x = factor(item_condition_id), y = price, fill = factor(item_condition_id))) +
geom_boxplot() +
scale_y_log10()+
coord_flip() +
theme(legend.position = "")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
After all this, it’s strange that we don’t see high prices in general for each of the different condition levels. It’s expecially bizzare that the median price is highest in items of item_condition_id equal to 5.
data %>%
group_by(item_condition_id, high_category) %>%
summarize(frequency = n(),
avg.price = mean(price)) %>%
ggplot(aes(x = high_category, y = frequency, fill = factor(item_condition_id))) +
geom_col() +
facet_grid(item_condition_id~.) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(legend.position = "")
data %>%
group_by(item_condition_id, high_category) %>%
summarize(frequency = n(),
avg.price = mean(price)) %>%
group_by(item_condition_id) %>%
mutate(max_frequency = frequency == max(frequency),
proportion.of.items = frequency/sum(frequency),
items.with.condition = sum(frequency)) %>%
filter(max_frequency) %>%
rename(most.common.category = high_category) %>%
select(item_condition_id,items.with.condition,most.common.category, proportion.of.items, avg.price)
Oh, I see. There are just much fewer products with item_condition_id equal to 4 and 5. Furthermore, the majority of the items with item_condition_id equal to one are of the Electronics category, while the majority of items with item_condition_id equal to 1 are of category Women.
Women’s products of poor condiition (item_description_id equal to 4 or 5) are not very valuable (next chart). However, used/damaged electronics in this dataset appear to be more expensive than new electronics.
data %>%
filter(high_category %in% c("Electronics", "Women"),
price > 0) %>%
ggplot(aes(x = high_category, y = price, fill = high_category)) +
geom_boxplot() +
scale_y_log10()+
coord_flip() +
facet_grid(item_condition_id~.) +
labs(title = "Prices of Women's and Electronics devies, across condition levels",
subtitle = "Used to explain why the average price of items of poor condition is higher than\nthat of items with good conditions.") +
theme(legend.position = "") +
ylab("price (log scale)") +
xlab("High-level Product Category")
The two charts above also highlights something that’s strange - amongst the few electronics items sold with item_condition_id equal to 5, the prices are genearly than the electronics items with item_condition_id equal to 1. Why might that be?
WARNING - SPECULATION! One hypothesis is people sell smaller, cheaper items new, and don’t bother selling them when they’re damaged, as they are worth so little. Larger items, such as Macbooks and iPhones, hold more value even when they’re damaged, and so they are sold whenthey are damaged more frequently, which may explain why electronic items of poor condition are more expensive.
** TODO - Look into that!**
How many items provide shipping?
data %>%
group_by() %>%
summarize(shipping = sum(shipping),
num.postings = n()) %>%
mutate(no.shipping = num.postings - shipping) %>%
mutate(no.shipping = no.shipping/num.postings,
shipping = shipping/num.postings) %>%
select(shipping, no.shipping) %>%
melt() %>%
ggplot(aes(x = variable, y = value, fill = variable, label = value )) +
geom_col() +
theme(axis.text.x = element_blank(),
axis.text.y = element_blank()) +
geom_text(vjust = -.5)
## No id variables; using all as measure variables
Pretty even split amongst postings that provide shipping and those who dont.
data %>%
group_by(item_condition_id) %>%
summarize(shipping = sum(shipping),
num.postings = n()) %>%
mutate(no.shipping = num.postings - shipping) %>%
mutate(no.shipping = no.shipping/num.postings,
shipping = shipping/num.postings) %>%
select(item_condition_id, shipping, no.shipping) %>%
melt(id.vars = c("item_condition_id")) %>%
ggplot(aes(x = variable, y = value, fill = variable)) +
geom_col() +
facet_grid(item_condition_id~.) +
coord_flip() +
ylab("Proportion of postings (of given condition)") +
xlab(NULL) +
theme(axis.text.y = element_blank()) +
labs(title = "Proportion of postings that (don't) provide shipping, split by product condition")
We can see that for all products of all item conditions except 1 (new items), the majority of postings do not provide shipping. The overall proportion of postings that provide shipping is more equally balanced with those that do not because the global count of postings of condition 1 is larger than the rest - which the chart of proportions does not show.
Looking at frequencies instead:
data %>%
group_by(item_condition_id) %>%
summarize(shipping = sum(shipping),
num.postings = n()) %>%
mutate(no.shipping = num.postings - shipping) %>%
select(item_condition_id, shipping, no.shipping) %>%
melt(id.vars = c("item_condition_id")) %>%
ggplot(aes(x = variable, y = value, fill = variable)) +
geom_col() +
facet_grid(item_condition_id~.) +
coord_flip() +
ylab("Frequency of postings (of given condition)") +
xlab(NULL) +
theme(axis.text.y = element_blank()) +
labs(title = "Frequency of postings that (don't) provide shipping, split by product condition")
There it is.
How does this affect prices?
data %>%
mutate(shipping = ifelse(shipping == 0, "no", "yes")) %>%
ggplot(aes(x = shipping, y = price, fill = shipping)) +
geom_boxplot(show.legend = FALSE) +
scale_y_log10() +
xlab("Provide shipping?") +
coord_flip()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
In the aggregate, it looks like items that do not have shipping are more expensive than those that do. But we already identfied that there are lurking variables (condition, and probably item category), and so we should be looking at more segmented data.
data %>%
mutate(shipping = ifelse(shipping == 0, "no", "yes")) %>%
ggplot(aes(x = shipping, y = price, fill = shipping)) +
geom_boxplot(show.legend = FALSE) +
scale_y_log10() +
xlab("Provide shipping?") +
ylab("Price (log-scale)") +
coord_flip() +
facet_grid(item_condition_id~.)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
So we know that whether or not shipping is provided is dependent on the item’s quality. How about the category of the itm?
data %>%
group_by(high_category, item_condition_id) %>%
summarize(shipping = sum(shipping),
num.postings = n()) %>%
mutate(no.shipping = num.postings - shipping) %>%
mutate(shipping = shipping/num.postings,
no.shipping = no.shipping/num.postings) %>%
mutate(lab = paste("Num postings:", num.postings, "\nPerc. shipping:", round(shipping, 4))) %>%
ggplot(aes(x = high_category, y = item_condition_id, fill = shipping, label = lab)) +
geom_tile() +
geom_text() +
scale_fill_distiller(palette = "Spectral", direction = 1)+
ggtitle("Percentage of postings that provide shipping, grouped by category and condition")
Here, the color of each tile represents the proportion of postings of that type that provide shipping (cooler = more postings provide shipping.
As we’ver seen, items of better condition (smaller item_condition_id) tend to provide shipping more frequently. This is evident in that the bottom row is overall “cooler” than the rest of the rows.
We also see that some product categories are more likely to procde shipping than others. For example, brand new electronics items provide free shipping quite frequently. Regardless of item quality, it seems that items of category Vintage & Collectibles and Beauty tend to provide shipping frequently. On the other hand, items of category Home very rarely provide shipping - especially used/damaged home goods.
For some item categories, the liklihood of the vendor providing shipping depends very much on the item condition. For example, amongst items of the Handmade category, 71.29% of new items come with free shipping, while only 38.01% of used items (item_condition_id equal to 3) come with shipping.
data %>%
mutate(shipping = ifelse(shipping == 1, "shipping", "no.shipping")) %>%
group_by(high_category, item_condition_id, shipping) %>%
summarize(num.postings = n(),
avg.price = mean(price)) %>%
mutate(lab = paste("num\npostings:", num.postings, sep = "\n")) %>%
ggplot(aes(x = high_category, y = item_condition_id, fill = avg.price, label = lab)) +
geom_tile() +
geom_text() +
facet_wrap(~shipping) +
scale_fill_distiller(palette = "Spectral", direction = 1) +
labs(title = "Average prices, split by shipping, item condition, and category.") +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
Now I’ve shown the averge prices of items, split product category, item condition, and whether or not shipping is provided.
In general, we see again that items that are more expensive tend to provide shipping more frequently (the right box is more red). Interestingly enough, we can see electronics items that are only slightly used (item_condition_id equal to 2) are much less expensive than new electronics items - for both items that do/don’t provide shipping. This may indidcate that electronics items lose their value quickly as soon as they have been sold (which is why refurbished goods are so much cheaper than new ones).
For some categories, such as Beauty and Sports & Outdoors, it seems like the average price may increase and the condition worsens. **A word of warning: the data gets rather sparse for items of poor condition (because fewer items are sold in poor condition than in good condition), so these averages may be misleading. Assuming that these averages are reliable, however, a possible explanation is that for these “less luxurious” categories, such as Sports & Outdoors*, if one is to bother selling his/her goods damaged, it’s probably because that good originally had a lot of value, and so it still retains enough value to be expensive compared to many of the cheap goods that can be purchased new.
These two columns are the real meat of the data:
data %>%
head() %>%
select(name, item_description,price)
So far the predictors we’ve explored are all categorical. Although it’s possible to learn a performant classifier on purely categorical features, I think the opportunity to extract continuous features that distinguish between postings with otherwise identical categorical features lies in these two columns.
At the same time, however, these two columns are going to be the most difficult to work with. R lacks a powerful Natural Language Processing toolkit. While Python has tools such as scikit-learn for vectorization, gensim for working with word embeddings, NLTK and SpaCy for part of speach tagging and other miscellaneous NLP tasks, R has no real packages that can compete.
Thus, I’ll do my best to explore these two columns and extract some interesting features, but I may have to swtich over to Python to learn my best classifier. Stay tuned!
I’ve noticed in the sample above that one of the item descriptions is filled with the string “No description yet. How many descriptions are like this one?
data %>%
filter(item_description == "No description yet") %>%
nrow()
## [1] 82489
82,489 - around 5% of the records. To be explicit, I’ll convert these values to N/A values:
data <- data %>%
mutate(item_description = ifelse(item_description == "No description yet", NA, item_description)) %>%
mutate(has.description = !is.na(item_description))
Does it make a difference if the item has a description, for the price?
data %>%
ggplot(aes(x = has.description, y = price, fill = has.description)) +
geom_boxplot() +
coord_flip() +
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
Maybe, but not 100% clear.
data %>%
ggplot(aes(x = has.description, y = price, fill = has.description)) +
geom_boxplot(show.legend = FALSE) +
coord_flip() +
scale_y_log10() +
facet_grid(item_condition_id ~.)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
data %>%
count(item_condition_id, has.description) %>%
rename(count = n) %>%
group_by(item_condition_id) %>%
mutate(total = sum(count)) %>%
mutate(label = ifelse(has.description, paste(round(count/total,3), "% has\ndesc.", sep = ""), NA)) %>%
ggplot(aes(x = item_condition_id, y = count, fill = has.description, label = label)) +
geom_col(position = "dodge") +
geom_text(vjust = 1) +
labs(title = "Proportion of items that (don't) have an item description")
## Warning: Removed 5 rows containing missing values (geom_text).
We can see that few items don’t have item descriptions, and that it looks like the proportion of items that have item descriptions is consistent amongst items of different values.
Now, I’ll extract a series of basic features about the descriptions/names, such as the word/character length of each description/names
data <- data %>%
mutate(description.length = str_length(item_description),
name.length = str_length(name)) %>%
replace_na(list(description.length = 0, name.length = 0))
p1 = data %>%
ggplot(aes(x = description.length)) +
geom_histogram(bins = 50) +
labs(title = "Description length")
p2 = data %>%
ggplot(aes(x = name.length)) +
geom_histogram(bins = 40) +
labs(title = "Name length")
grid.arrange(p1, p2)
It looks like the mode description length is around 50 words, but people get a lot wordier than that. The length of the posting name also mpeaks at around 40 words, but there is no long tail. This is almost surely because Mercari caps the length of the title at around 40 words, and people try and get in as much information as possible into the title.
Are the lengths of a title and a length of a description correlated?
data %>%
sample_n(200000) %>%
ggplot(aes(x = description.length, y = name.length, color = high_category)) +
geom_point(alpha = .01) +
geom_smooth(se = FALSE) +
labs(title = "Description length vs Name length",
subtitle = "Sample of data - do not interpret this as a linear relationship!")
## `geom_smooth()` using method = 'gam'
cor(data$description.length, data$name.length)
## [1] 0.2502889
It seems that there may be a weak relationship between the length of the description and the name of length of the name.
But how about the price?
corrplot(cor(select(data, price, description.length, name.length)))
Almost no correlation between the description/item length and the price.
Now, a hypothesis: postings of items with big brand names will likely show off about that brand. For example, if I’m selling you a handbag from Coach, I’m going to let you know it’s a Coach bag, and maybe charge a premium for it.
So an interesting feature - how many of the items include the brand name in the title/item description?
# include logical columns which indicate if name/description contain brand name.
data <- data %>%
mutate(title.contains.brand = str_detect(str_to_lower(name), str_to_lower(brand_name)),
description.contains.brand = str_detect(str_to_lower(item_description), str_to_lower(brand_name)))
Does this affect price?
p1 <- data %>%
ggplot(aes(x = title.contains.brand, y = price, fill = title.contains.brand)) +
geom_boxplot() +
scale_y_log10() +
coord_flip()
p2 <- data %>%
count(title.contains.brand) %>%
rename(frequency = n) %>%
ggplot(aes(x = title.contains.brand, y = frequency, fill = title.contains.brand)) +
geom_col()
grid.arrange(p1,p2)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 874 rows containing non-finite values (stat_boxplot).
Since 42% of the orignal data does not have NA brand_name, the majority of this feature will be NA as well.
Of the remaining items (those that have a brand specified), it looks like those that contain the brand name in the title may have slightly higher prices in the aggregate, but in general the price spread is very similar.
Splitting this up by item category:
data %>%
filter(!is.na(brand_name)) %>%
ggplot(aes(x = title.contains.brand, y = price, fill = title.contains.brand)) +
geom_boxplot() +
scale_y_log10() +
coord_flip() +
facet_grid(high_category~.) +
labs(title = "Prices of postings with/without brand name in title, split by item category")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 528 rows containing non-finite values (stat_boxplot).
data %>%
filter(!is.na(brand_name)) %>%
group_by(high_category, title.contains.brand) %>%
summarize(frequency = n()) %>%
ggplot(aes(x = title.contains.brand, y = frequency, fill = title.contains.brand)) +
geom_col() +
facet_grid(high_category~.) +
coord_flip() +
labs(title = "Frequency of postings with/without brand name in title, split by category")
It looks like for some categories, whether or not the title contains the brand name has a larger affect on the price than others. For example, for items in the Electronics category, the par between the median price of items that contain the brand name in the title and those that don’t is higher than for other categories (this may be subject to the small number of electronics items in that contain a brand name!).
Boxplots are good for spotting high-level differences, but I’m more interested in if there is a difference in the mechanism that determines the prices of items that contain the brand in the title and those that don’t. For that purpose, I need to see the shape of the data, as well as the summary statistics displayed by boxplots:
data %>%
filter(!is.na(brand_name)) %>%
ggplot(aes(x = price, fill = title.contains.brand)) +
geom_density(position="stack") +
scale_x_log10() +
facet_grid(high_category~.)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 528 rows containing non-finite values (stat_density).
No clear difference in the distributions of prices for items that have the brand name in the title and those that don’t. Further splitting by item condition:
data %>%
filter(!is.na(brand_name)) %>%
ggplot(aes(x = price, fill = title.contains.brand)) +
geom_density(position="stack") +
scale_x_log10() +
facet_grid(high_category~item_condition_id)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 528 rows containing non-finite values (stat_density).
It’s hard to tell because the data starts to get sparse as you partition it twice, but for the most part the shapes of the price distributions for items that have the brand name in the title and those that don’t are roughly the same.
From the chart above, it looks like a few exceptions are:
Going through one by one, I’ll see if these diffrences in distribution shape are substatial, or if they’re just caused by the sparcity of the data:
# a quick function for making the chart that I'm about to make a bunch of times
tmp <- function(category, condition){
p1 <- data %>%
filter(!is.na(brand_name),
high_category == category,
item_condition_id == condition) %>%
ggplot(aes(x = price, fill = title.contains.brand)) +
geom_density(position = "stack", show.legend = FALSE) +
xlim(0, 100) +
labs(title = paste(category, "goods of condition", condition),
subtitle = "Those with brand name nonempty.")
p2 <- data %>%
filter(!is.na(brand_name),
high_category == category,
item_condition_id == condition) %>%
count(title.contains.brand) %>%
rename(frequency = n) %>%
ggplot(aes(x = title.contains.brand, y = frequency, fill = title.contains.brand, label = frequency)) +
geom_col(show.legend = FALSE) +
geom_label(show.legend = FALSE)
grid.arrange(p1,p2,ncol=2)
}
# *Home* goods of condition 5
tmp("Home", 5)
There are very few Home goods of condition 5 with a brand name (15 total), so the distributions are not reliable.
# *Other* products of condition 4
tmp("Other", 4)
## Warning: Removed 2 rows containing non-finite values (stat_density).
Again, the data here is too sparse to make any claims about the difference in the distribution shapes.
tmp("Handmade", 2)
Again sparse data!
I think it’s fair to assume that the differences in the shapes of the price distributions between prices that contain the brand name in the title and those that don’t was only due to the sparcity of the data after I segment it twice. I suspect this feature will not be that important.
How about if the brand name is in the item description?
data %>%
filter(!is.na(brand_name) &
!is.na(item_description)) %>%
ggplot(aes(x = description.contains.brand, y = price, fill = description.contains.brand)) +
geom_boxplot() +
scale_y_log10() +
coord_flip() +
facet_grid(high_category~.) +
labs(title = "Prices of postings with/without brand name in item description, split by item category")
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 488 rows containing non-finite values (stat_boxplot).
The spreads look almost identical. I’m not going to spend too much time trying to determine if this feature is meaningful. I suspect it is not, since whether or not the brand name was contained in the title seemed to make little difference to the price, so it seems reasonable that it would not make a difference if the brand name was in the item description.
Mercari wrote in the challenge description that character squences that look like prices (e.g $20.00) were removed and replaced by the string [rm]. Perhaps the presense of this string will provide some signal:
data %>%
mutate(description.contains.rm = str_detect(item_description,pattern = "[rm]"))